
# connect to Postgres database where FEC mirror has been restored per FEC instructions
fec = dbConnect(RPostgres::Postgres(), dbname = 'fec_mirror', port = 5434)

# read a sample of one year to obtain column information
out = Id(schema = 'disclosure', table = 'fec_fitem_sched_a_2019_2020') %>%
  tbl(fec, .) %>%
  head(10) %>%
  collect()

# select columns to keep
cols_keep = c(1:15, 17:19, 21:32, 39, 44, 53:54, 56:59, 61, 68:70, 75:76, 79:81)

# iterate over cycles
for (year in seq(2008, 2020, 2)) {
  # write database to large csv file
  paste0(
    'copy(select {names(out)[cols_keep] %>% str_flatten(collapse = \', \')} ' ,
    'from disclosure.fec_fitem_sched_a_{year-1}_{year} ) ',
    'to \'fec_drop/{year}.csv\' with CSV DELIMITER \',\' HEADER'
  ) %>%
    glue() %>%
    dbExecute(fec, .)
  
  # specify schema for the arrow table
  out = fread('fec_drop/{year}.csv', nrows = 10) %>%
    mutate(across(everything(), as.character)) %>%
    arrow_table() %>%
    .$schema
  
  # open connection to the csv and write it in chunks to parquet files
  open_dataset('fec_drop/{year}.csv',
               format = 'csv',
               schema = out) %>%
    write_dataset('fec_arrow/cycle_table={year}',
                  format = 'parquet',
                  max_rows_per_file = 1e6)
  
}
